Data: The first look

The two data sets used in this analysis are the Madison case data sourced from the Wisconsin DHS and wastewater concentration data produced by the Wisconsin State Laboratory of Hygiene. This wastewater data has entries every couple of days from November 15 2020 to June 24 2021.

“Files Used:”

Z:/COVID-19_WastewaterAnalysis/data/processed/MMSD_Interceptor_Cases_2_7_22.csv

Z:/COVID-19_WastewaterAnalysis/data/processed/LIMSWasteData_02-09-22.csv

Date Site Cases SevenDayMACases EpisodeCase SpecCollectedCase N1 BCoV N2 PMMoV GeoMeanN12 FlowRate temperature equiv_sewage_amt sample_id cov2_conc BCoVConc capacity_mgd composite_freq concentration_method conductivity county do epaid extraction_method inhibition_adjust inhibition_detect inhibition_method lod_ref n1_lod n1_loq n1_ntc_amplify n1_num_no_target_control n1_num_ntc_amplify N1Error n1_sars_cov2_lod n1_sars_cov2_units n2_lod n2_loq n2_ntc_amplify n2_num_no_target_control n2_num_ntc_amplify N2Error n2_sars_cov2_lod n2_sars_cov2_units pcr_type ph Pop quant_stan_ref quant_stan_type sample_collect_time sample_location sample_location_specify sample_matrix sample_type state RepDate tss wwtp_comments zipcode analytical_comments unformatted_collectdate mycustomer AVG wt N1Pure N2Pure
2020-09-11 MMSD-P11 10 42.85714 18 22 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2020-09-11 MMSD-P18 9 21.42857 9 11 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2020-09-11 MMSD-P2 107 23.28571 92 195 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2020-09-11 MMSD-P7 2 22.85714 2 1 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2020-09-11 MMSD-P8 10 41.00000 8 8 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
2020-09-12 MMSD-P11 15 40.42857 14 17 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA

The case data has a strong weekend effect so for this section we look at a seven day smoothing of cases. The simple display of the data shows the core components of this story. First that wastewater data is very noisy. And that there is a hint of a relationship between the two signals.

FirstImpressionDF <- FullDF%>%
  NoNa("N1","Cases")

FirstImpression <- FirstImpressionDF%>%#Removing outliers
  ggplot(aes(x=Date))+#Data depends on time
  geom_line(aes(y=N1, color="N1",info=N1))+#compares N1 to Cases
  geom_line(aes(y=MinMaxFixing(PMMoV,N1), color="PMMoV",info=PMMoV))+
  geom_line(aes(y=MinMaxFixing(FlowRate,N1), color="FlowRate",info=FlowRate))+
  geom_line(aes(y=MinMaxFixing(BCoV,N1), color="BCoV",info=BCoV))+
  #geom_line(aes(y=MinMaxFixing(temperature,N1), color="temperature",info=temperature))+
  #geom_line(aes(y=MinMaxFixing(tss,N1), color="tss",info=tss))+
  geom_line(aes(y=MinMaxFixing(equiv_sewage_amt,N1), color="equiv_sewage_amt",info=equiv_sewage_amt))+
  geom_line(aes(y=MinMaxFixing(SevenDayMACases,N1), color="Seven Day MA Cases",info=Cases))+
  geom_line(aes(y=MinMaxFixing(N2,N1), color="N2",info=N2))+
  labs(y="N1")



ggplotly(FirstImpression)
#To remove weekend effects we are looking at the 7 day smoothing of cases.
DaySmoothed=21#Very wide smoothing to find where the data strong deviate from trend

ErrorMarkedDF <- FullDF%>%#Remove older Var data that clearly has no relationship to Cases
  mutate(EarlyOutlier = ifelse(Date < mdy("11/20/2020"),TRUE,FALSE))#replace old. 
#"11/20/2020"
#"1/1/2021"

ErrorMarkedDF$DetectedOutlierN1 <- IdentifyOutliers(ErrorMarkedDF$N1)
ErrorMarkedDF$DetectedOutlierN2 <- IdentifyOutliers(ErrorMarkedDF$N2)  
ErrorMarkedDF$DetectedOutlierGeo <- IdentifyOutliers(ErrorMarkedDF$GeoMeanN12)
ErrorMarkedDF$DetectedOutlierPMMOV <- IdentifyOutliers(ErrorMarkedDF$PMMoV)
ErrorMarkedDF$DetectedOutlierBCOv <- IdentifyOutliers(ErrorMarkedDF$BCoV)

ErrorRemovedDF <- ErrorMarkedDF%>%
  mutate(PotentialOutlier = Date>mdy("03/31/2021")&mdy("05/12/2021")>Date)%>%
         select(-EarlyOutlier)#Removing unneeded calculated columns


ErrorMarkedDF%>%
  select(-EarlyOutlier)%>%
  #filter(DetectedOutlierN1|DetectedOutlierN2|DetectedOutlierGeo)%>%
  mutate(NumberOfOutliers = DetectedOutlierN1+DetectedOutlierN2+DetectedOutlierGeo)%>%
  group_by(NumberOfOutliers)%>%
  summarize(n())
## # A tibble: 4 x 2
##   NumberOfOutliers `n()`
##              <int> <int>
## 1                0  7856
## 2                1    93
## 3                2    86
## 4                3   206
ErrorMarkedDF%>%
  select(-EarlyOutlier)%>%
  summarize(N1 = sum(DetectedOutlierN1), N2 = sum(DetectedOutlierN2), Geo = sum(DetectedOutlierGeo), N12 =sum(DetectedOutlierN1*DetectedOutlierN2), N1Geo =sum(DetectedOutlierN1*DetectedOutlierGeo), N2Geo =sum(DetectedOutlierN2*DetectedOutlierGeo), AllVars =sum(DetectedOutlierN1*DetectedOutlierN2*DetectedOutlierGeo))
##    N1  N2 Geo N12 N1Geo N2Geo AllVars
## 1 304 292 287 217   242   245     206
OutlierComp <- ErrorMarkedDF%>%
  filter(DetectedOutlierN1|DetectedOutlierN2|DetectedOutlierGeo)%>%
  mutate(Name = "",Name = paste(Name,ifelse(DetectedOutlierN1,"N1","")),
         Name = paste(Name,ifelse(DetectedOutlierN2,"N2","")),
         Name = paste(Name,ifelse(DetectedOutlierGeo,"Geo","")))%>%
  ggplot(aes(x=Date))+
  geom_point(aes(y=N1,shape="N1",color=Name))+
  geom_point(aes(y=N2,shape="N2",color=Name))+
  geom_point(aes(y=GeoMeanN12,shape="Geo",color=Name))
ggplotly(OutlierComp)
#,info1=N1,info2=GeoMeanN12

OutlierComp <- ErrorMarkedDF%>%
  filter(DetectedOutlierN1|DetectedOutlierPMMOV|DetectedOutlierBCOv)%>%
  mutate(Name = "",Name = paste(Name,ifelse(DetectedOutlierN1,"N1","")),
         Name = paste(Name,ifelse(DetectedOutlierPMMOV,"PMMoV","")),
         Name = paste(Name,ifelse(DetectedOutlierBCOv,"BCoV","")))%>%
  ggplot(aes(x=Date))+
  geom_point(aes(y=N1,color=Name,info1=PMMoV,info2=BCoV))
ggplotly(OutlierComp)
#ErrorMarkedDF%>%
#  select(-EarlyOutlier)%>%
#  write.csv(file="OutlierFlagDF")
#wwtp_comments
#analytical_comments
#ErrorRemovedDF%>%
#  filter(!is.na(analytical_comments))
CoVarGraphic <- ErrorRemovedDF%>%
  ggplot()+#LargeError
  aes(y = N1,color=DetectedOutlier,info = Date)
#PotentialOutlier#DetectedOutlier
CoVarGraphicPMMoV <- CoVarGraphic+
  geom_point(aes(x = PMMoV))
#Outliers
#26203622#29931423
#149752.0#461781.5
#Potintial outlier
#26463265#30048370
#172041.0#98442.5
CoVarGraphicFlowRate <- CoVarGraphic+
  geom_point(aes(x = FlowRate))
CoVarGraphicBCoV <- CoVarGraphic+
  geom_point(aes(x = BCoV))
CoVarGraphicequiv_sewage_amt <- CoVarGraphic+#temperature,equiv_sewage_amt
  geom_point(aes(x = equiv_sewage_amt))
#PMMoV,FlowRate,BCoV,temperature,equiv_sewage_amt
ggplotly(CoVarGraphicPMMoV)
ggplotly(CoVarGraphicBCoV)
ggplotly(CoVarGraphicFlowRate)
ggplotly(CoVarGraphicequiv_sewage_amt)

ErrorRemovedDF%>%
  group_by(DetectedOutlier)%>%
  summarize(median(PMMoV,na.rm = TRUE),sd(PMMoV,na.rm = TRUE),
            median(N1,na.rm = TRUE),sd(N1,na.rm = TRUE))


AA <- ErrorRemovedDF%>%
  ggplot(aes(x=Date,y=equiv_sewage_amt))+
  geom_point()
ggplotly(AA)
summary(lm(DetectedOutlier ~ PMMoV ,data = ErrorMarkedDF))

library(rpart)
fit <- rpart(DetectedOutlier ~ PMMoV + N1,
   method="class", data=ErrorMarkedDF)

printcp(fit)
plotcp(fit)
summary(fit)
plot(fit)
text(fit, use.n=TRUE, all=TRUE, cex=.5)

CoVarGraphicPMMoV <- CoVarGraphic+
  geom_point(aes(x = PMMoV))+
  geom_hline(yintercept = 5.61e5)+
  geom_vline(xintercept = 2.82e7)
ggplotly(CoVarGraphicPMMoV)


ErrorMarkedDF%>%
  ggplot(aes(x = DetectedOutlier, y = log(PMMoV))) +
        geom_boxplot()


t.test(PMMoV ~ DetectedOutlier, data = ErrorMarkedDF)

#26203622#29931423
#149752.0#461781.5
LoessFunc <- function(DF,SpanConstant = .01,Var="N1"){
  MainDF <- DF
  MainDF[[paste0("loess",Var)]] <- loessFit(y=(MainDF[[Var]]), 
                      x=MainDF$Date, #create loess fit of the data
                      span=SpanConstant, #span of .2 seems to give the best result, not rigorously chosen
                      iterations=2)$fitted#2 iterations remove some bad patterns
  return(MainDF)
}
LoessN1Messurement <- FullDF%>%
  group_split(Site) %>%
  purrr::map_dfr(LoessFunc)

ErrorCompPlot <- function(ErrorVar="N1",CompVar="N2"){
  ErrorPlotDF <- FullDF%>%
    group_split(Site) %>%
    purrr::map_dfr(LoessFunc,Var = ErrorVar)%>%
    filter(!is.na(!!sym(paste0("loess",ErrorVar))))%>%
    mutate(TrendError = abs(!!sym(ErrorVar)-!!sym(paste0("loess",ErrorVar))))
  
    NoLogPlot <- ErrorPlotDF%>%
      ggplot(aes(y=TrendError,x=!!sym(CompVar),info=Date))+
      geom_point()
    
    LogPlot <- ErrorPlotDF%>%
      ggplot(aes(y=TrendError,x=!!sym(CompVar),info=Date))+
      geom_point()+
      scale_y_log10()+
      scale_x_log10()
    #print(ggplotly(NoLogPlot))
    #print(ggplotly(LogPlot))
    print((NoLogPlot))
    print((LogPlot))
    Formula <- as.formula(
        paste("TrendError", 
        paste(CompVar, collapse = " + "), 
        sep = " ~ "))

    fit <- lm(Formula, data = ErrorPlotDF)
    print(summary(fit)$coefficients[,4]) 
    print(summary(fit)$r.squared)
  return()
}
PlotVarNames <- LoessN1Messurement%>%
  select_if(function(col) length(unique(col))>1)%>%
  names()

#names(LoessN1Messurement)
for (Name in PlotVarNames){
  try(
    print(ErrorCompPlot(CompVar = Name))
  )
}

## Error in Math.Date(x, base) : log not defined for "Date" objects

## Error in log(x, base) : non-numeric argument to mathematical function

## (Intercept)       Cases 
##  0.02940312  0.01208053 
## [1] 0.00763837
## NULL

##     (Intercept) SevenDayMACases 
##    1.856535e-01    7.167533e-06 
## [1] 0.02219889
## NULL

##  (Intercept)  EpisodeCase 
## 0.2879335219 0.0001109867 
## [1] 0.01755592
## NULL

##       (Intercept) SpecCollectedCase 
##      0.2985190617      0.0003453027 
## [1] 0.01539464
## NULL

##  (Intercept)           N1 
## 2.242998e-03 1.801557e-81 
## [1] 0.06198631
## NULL

## (Intercept)        BCoV 
## 0.003978898 0.776838973 
## [1] 1.417523e-05
## NULL

##  (Intercept)           N2 
## 6.260085e-01 3.744956e-40 
## [1] 0.03126069
## NULL

##  (Intercept)        PMMoV 
## 4.451174e-05 9.699300e-01 
## [1] 2.488381e-07
## NULL

##  (Intercept)   GeoMeanN12 
## 9.103091e-02 2.554871e-59 
## [1] 0.04513622
## NULL

## (Intercept)    FlowRate 
## 8.25086e-02 1.15294e-20 
## [1] 0.01540143
## NULL

## (Intercept) temperature 
##   0.1592601   0.7395512 
## [1] 4.829313e-05
## NULL

##      (Intercept) equiv_sewage_amt 
##      0.001315222      0.083590044 
## [1] 0.0005240121
## NULL

## (Intercept)   sample_id 
##  0.09342473  0.06506079 
## [1] 0.0005957142
## NULL

##  (Intercept)    cov2_conc 
## 8.661648e-02 3.853731e-59 
## [1] 0.04501469
## NULL

## (Intercept)    BCoVConc 
##  0.03524225  0.10355942 
## [1] 0.000463843
## NULL

##  (Intercept) capacity_mgd 
## 5.870041e-01 3.402299e-10 
## [1] 0.007005544
## NULL

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

##  (Intercept) conductivity 
## 1.328075e-01 4.317016e-05 
## [1] 0.004228313
## NULL

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## (Intercept)      n1_lod 
##   0.2367700   0.2765731 
## [1] 0.0002072538
## NULL

## (Intercept)      n1_loq 
##   0.2400919   0.2924249 
## [1] 0.0001940499
## NULL

## Error in log(x, base) : non-numeric argument to mathematical function

##              (Intercept) n1_num_no_target_control 
##                0.2995248                0.7217732 
## [1] 2.220059e-05
## NULL

## [1] NaN
## [1] 0
## NULL

##   (Intercept)       N1Error 
##  1.616777e-20 1.593410e-230 
## [1] 0.1680539
## NULL

## Error in log(x, base) : non-numeric argument to mathematical function

## (Intercept)      n2_lod 
##   0.2175018   0.3148867 
## [1] 0.0001768324
## NULL

## (Intercept)      n2_loq 
##   0.2486691   0.2765708 
## [1] 0.0002072557
## NULL

## Error in log(x, base) : non-numeric argument to mathematical function

##              (Intercept) n2_num_no_target_control 
##                0.2995248                0.7217732 
## [1] 2.220059e-05
## NULL

##   (Intercept)       N2Error 
##  1.912971e-10 7.804265e-129 
## [1] 0.0970591
## NULL

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

##  (Intercept)           ph 
## 0.0001011119 0.9449674492 
## [1] 8.992814e-07
## NULL

##  (Intercept)          Pop 
## 9.156992e-01 6.371909e-11 
## [1] 0.00758504
## NULL

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## (Intercept)         tss 
##         NaN         NaN 
## [1] NaN
## NULL

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## Error in log(x, base) : non-numeric argument to mathematical function

## (Intercept)  mycustomer 
##   0.5796107   0.5795374 
## [1] 5.374681e-05
## NULL

##  (Intercept)          AVG 
## 9.103090e-02 2.554866e-59 
## [1] 0.04513622
## NULL

## (Intercept)          wt 
##   0.7155899   0.4712583 
## [1] 9.086939e-05
## NULL

##  (Intercept)       N1Pure 
## 2.242998e-03 1.801557e-81 
## [1] 0.06198631
## NULL

##  (Intercept)       N2Pure 
## 6.229319e-01 1.855587e-41 
## [1] 0.03134785
## NULL

## (Intercept)     loessN1 
## 0.004266055 0.010469474 
## [1] 0.001146714
## NULL
ModelDF <- FullDF%>%
    group_split(Site) %>%
    purrr::map_dfr(LoessFunc,Var = "N1")%>%
    filter(!is.na(loessN1))


#fmla <- as.formula(paste0("loessN1 ~ ", paste(PlotVarNames[c(3:56,66)], collapse= " + ")))
#summary(lm(fmla,data = ModelDF))
#names(LoessN1Messurement)